#!/anaconda3/bin/python

# Copyright 2019 Transparency in Algorithms Group, RISE, Nicosia, Cyprus
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

# This script was created by the Transparency in Algorithms Group, RISE, Nicosia, Cyprus for research purposes. Given
# an input file in CSV format whose records each hold a target followed by a set of words, it computes, for every
# record, the proportion of those words that belong to each super- and sub-cluster and prints the resulting vectors
# in CSV format to standard output.

# Usage: python count_categories.py --input '/path/to/an/example_input_file.csv' > example_vectors.csv
# or:    ./count_categories.py --input '/path/to/an/example_input_file.csv' > example_vectors.csv

import os
import sys
import argparse

# Global dictionary word lists
#
# Every list starts out empty and is filled from its dictionary file by the *_words() function that declares it
# global. Each group below starts with the super-cluster list, followed by the sub-cluster lists reported with it.

# DEMOGRAPHIC
demographic = list()
age = list()
feminine = list()
masculine = list()
nonbinary = list()
race = list()

# CONCRETE
concrete = list()
action = list()
body = list()
hair = list()
clothing = list()
meta = list()
colors = list()
shape = list()

# ABSTRACT
abstract = list()
judgement = list()
traits = list()
emotion = list()
occupation = list()

# OTHER
other = list()
ambiguous = list()
lack_of = list()
misc = list()
inconclusive = list()

# INFLAMMATORY
inflammatory_all = list()
inflammatory = list()


# Loads a specific dictionary given a csv filepath and returns the dictionary in a list of words.
def load_dictionary(filepath):
    """Load a dictionary file and return its entries as a list of words.

    Every line of the file becomes one entry, with surrounding whitespace (including the line terminator) stripped.

    Args:
        filepath: Path to the dictionary file, one entry per line.

    Returns:
        A list holding one stripped string per line of the file, in file order.

    Raises:
        SystemExit: With exit status 1 if ``filepath`` is not an existing file.
    """
    if not os.path.isfile(filepath):
        # Standard output carries the CSV result (the usage notes redirect it to a file), so the diagnostic must not
        # be printed there. sys.exit(message) writes the message to stderr and terminates with status 1 instead of
        # reporting success.
        sys.exit("File path {} does not exist. Exiting...".format(filepath))

    with open(filepath) as fp:
        return [line.strip() for line in fp]


# Given a set of words, it counts the super and sub-clusters of demographic related words returning the proportion of
# frequency for each super and sub-cluster.
def demographic_words(words):
    """Return the proportion of ``words`` found in the demographic super-cluster and in each of its sub-clusters.

    The dictionaries are read from ``dict/*.csv`` while the module-level ``demographic`` list is still empty and are
    then kept in the module-level lists. Words are lower-cased before they are looked up.

    Args:
        words: Sequence of word strings belonging to one record.

    Returns:
        A tuple ``(demographic, age, feminine, masculine, nonbinary, race)`` of floats. Each value is the number of
        dictionary matches summed over ``words`` divided by ``len(words)``; every value is ``0.0`` when ``words`` is
        empty.
    """
    # sub-clusters
    global age, feminine, masculine, nonbinary, race
    # super-cluster
    global demographic

    # if it's empty or null initialize everything
    if not demographic:
        # Note: To handle feminine, masculine and nonbinary as a single gender cluster, load "dict/gender.csv" here
        # instead and use that list in the union and in the returned tuple below.
        age = load_dictionary("dict/age.csv")
        feminine = load_dictionary("dict/feminine.csv")
        masculine = load_dictionary("dict/masculine.csv")
        # This is an empty category that was found in the human data collected, but not in the APIs output. It is
        # kept only for future reference.
        nonbinary = load_dictionary("dict/nonbinary.csv")
        race = load_dictionary("dict/race.csv")

        # "Super-list" of demographic words
        demographic = list(set().union(age, feminine, masculine, nonbinary, race))

    # Order of this tuple is the order of the returned values.
    clusters = (demographic, age, feminine, masculine, nonbinary, race)

    # An empty record has no proportions to compute; return zeros instead of dividing by zero.
    if not words:
        return (0.0,) * len(clusters)

    # Lower-case every word once instead of once per dictionary.
    lowered = [word.lower() for word in words]
    total = len(lowered)

    # Calculate the proportion of words, not the raw word count. list.count() is used so that an entry listed
    # several times in a sub-cluster dictionary keeps contributing once per occurrence.
    return tuple(sum(tags.count(word) for word in lowered) / total for tags in clusters)


# Given a set of words, it counts the super and sub-clusters of concrete related words returning the proportion of
# frequency for each super and sub-cluster.
def concrete_words(words):
    """Return the proportion of ``words`` found in each concrete sub-cluster and in the concrete super-cluster.

    The dictionaries are read from ``dict/*.csv`` while the module-level ``concrete`` list is still empty and are
    then kept in the module-level lists. Words are lower-cased before they are looked up.

    Args:
        words: Sequence of word strings belonging to one record.

    Returns:
        A tuple ``(action, body, hair, clothing, meta, color, shape, concrete)`` of floats; note that the
        super-cluster value comes last. Each value is the number of dictionary matches summed over ``words`` divided
        by ``len(words)``; every value is ``0.0`` when ``words`` is empty.
    """
    # sub-clusters
    global action, body, hair, clothing, meta, colors, shape
    # super-cluster
    global concrete

    # if it's empty or null initialize everything
    if not concrete:
        action = load_dictionary("dict/actions.csv")
        body = load_dictionary("dict/body.csv")
        hair = load_dictionary("dict/hair.csv")
        clothing = load_dictionary("dict/clothing.csv")
        meta = load_dictionary("dict/meta.csv")
        colors = load_dictionary("dict/colors.csv")
        shape = load_dictionary("dict/shape.csv")

        # Make the "super-list" of concrete words
        concrete = list(set().union(action, body, hair, clothing, meta, colors, shape))

    # Order of this tuple is the order of the returned values.
    clusters = (action, body, hair, clothing, meta, colors, shape, concrete)

    # An empty record has no proportions to compute; return zeros instead of dividing by zero.
    if not words:
        return (0.0,) * len(clusters)

    # Lower-case every word once instead of once per dictionary.
    lowered = [word.lower() for word in words]
    total = len(lowered)

    # Calculate the proportion of words, not the raw word count.
    return tuple(sum(tags.count(word) for word in lowered) / total for tags in clusters)


# Given a set of words, it counts the super and sub-clusters of abstract related words returning the proportion of
# frequency for each super and sub-cluster.
def abstract_words(words):
    """Return the proportion of ``words`` found in each abstract sub-cluster and in the abstract super-cluster.

    The dictionaries are read from ``dict/*.csv`` while the module-level ``abstract`` list is still empty and are
    then kept in the module-level lists. Words are lower-cased before they are looked up.

    Args:
        words: Sequence of word strings belonging to one record.

    Returns:
        A tuple ``(judgement, traits, emotion, occupation, abstract)`` of floats; note that the super-cluster value
        comes last. Each value is the number of dictionary matches summed over ``words`` divided by ``len(words)``;
        every value is ``0.0`` when ``words`` is empty.
    """
    # sub-clusters
    global judgement, traits, emotion, occupation
    # super-cluster
    global abstract

    # if it's empty or null initialize everything
    if not abstract:
        judgement = load_dictionary("dict/judgement.csv")
        traits = load_dictionary("dict/traits.csv")
        emotion = load_dictionary("dict/emotion.csv")
        occupation = load_dictionary("dict/occupation.csv")

        # Make the "super-list" of abstract words
        abstract = list(set().union(judgement, traits, emotion, occupation))

    # Order of this tuple is the order of the returned values.
    clusters = (judgement, traits, emotion, occupation, abstract)

    # An empty record has no proportions to compute; return zeros instead of dividing by zero.
    if not words:
        return (0.0,) * len(clusters)

    # Lower-case every word once instead of once per dictionary.
    lowered = [word.lower() for word in words]
    total = len(lowered)

    # Calculate the proportion of words, not the raw word count.
    return tuple(sum(tags.count(word) for word in lowered) / total for tags in clusters)


# Given a set of words, it counts the super and sub-clusters of "other" related words returning the proportion of
# frequency for each super and sub-cluster.
def other_words(words):
    """Return the proportion of ``words`` found in each "other" sub-cluster and in the "other" super-cluster.

    The dictionaries are read from ``dict/*.csv`` while the module-level ``other`` list is still empty and are then
    kept in the module-level lists. Words are lower-cased before they are looked up.

    Args:
        words: Sequence of word strings belonging to one record.

    Returns:
        A tuple ``(ambiguous, lack_of, misc, inconclusive, other)`` of floats; note that the super-cluster value
        comes last. Each value is the number of dictionary matches summed over ``words`` divided by ``len(words)``;
        every value is ``0.0`` when ``words`` is empty.
    """
    # sub-clusters
    global ambiguous, lack_of, misc, inconclusive
    # super-cluster
    global other

    # if it's empty or null initialize everything
    if not other:
        # This used to load "dict/body.csv" (copied from the concrete cluster), which made the Ambiguous and OTHER
        # columns report body words.
        # NOTE(review): confirm that the ambiguous dictionary is stored as dict/ambiguous.csv.
        ambiguous = load_dictionary("dict/ambiguous.csv")
        lack_of = load_dictionary("dict/lack.csv")
        misc = load_dictionary("dict/misc.csv")
        # Note: This is an empty category that was found in the human data collected, but not in the APIs output.
        # It is kept only for future reference.
        inconclusive = load_dictionary("dict/inconclusive.csv")

        # Make the "super-list" of other words. inconclusive is part of the union so that the super-cluster covers
        # every sub-cluster counted below.
        other = list(set().union(ambiguous, lack_of, misc, inconclusive))

    # Order of this tuple is the order of the returned values.
    clusters = (ambiguous, lack_of, misc, inconclusive, other)

    # An empty record has no proportions to compute; return zeros instead of dividing by zero.
    if not words:
        return (0.0,) * len(clusters)

    # Lower-case every word once instead of once per dictionary.
    lowered = [word.lower() for word in words]
    total = len(lowered)

    # Calculate the proportion of words, not the raw word count.
    return tuple(sum(tags.count(word) for word in lowered) / total for tags in clusters)


# Given a set of words, it counts the super and sub-clusters of inflammatory related words returning the proportion of
# frequency for each super and sub-cluster.
def inflammatory_words(words):
    """Return the proportion of ``words`` found in the inflammatory super-cluster.

    The dictionary is read from ``dict/inflammatory.csv`` while the module-level ``inflammatory_all`` list is still
    empty and is then kept in the module-level lists. Words are lower-cased before they are looked up.

    Args:
        words: Sequence of word strings belonging to one record.

    Returns:
        A float: the number of dictionary matches summed over ``words`` divided by ``len(words)``, or ``0.0`` when
        ``words`` is empty.
    """
    # sub-cluster
    global inflammatory
    # super-cluster
    global inflammatory_all

    # if it's empty or null initialize everything
    if not inflammatory_all:
        inflammatory = load_dictionary("dict/inflammatory.csv")

        # Make the "super-list" of inflammatory words
        inflammatory_all = list(set().union(inflammatory))

    # An empty record has no proportion to compute; return zero instead of dividing by zero.
    if not words:
        return 0.0

    matches = sum(inflammatory_all.count(word.lower()) for word in words)

    # Calculate the proportion of words, not the raw word count.
    return matches / len(words)


if __name__ == '__main__':
    # Entry point: read the input CSV given with --input and print one vector of cluster proportions per record.

    # CLI Options
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', default=None, help='Path to the input file')
    args = parser.parse_args()

    # Processing the input file

    DELIMITER = ','

    # Standard output carries the CSV result (the usage notes redirect it to a file), so diagnostics are passed to
    # sys.exit(), which writes them to stderr and terminates with exit status 1 instead of reporting success.

    # Check for unset input filepath option
    if args.input is None:
        sys.exit('[!] Input option is not set! Please provide one to continue.')

    filepath = args.input

    if not os.path.isfile(filepath):
        sys.exit("File path {} does not exist. Exiting...".format(filepath))

    # Print Header
    # Note: Put gender in the following print statement if you want to handle masculine, feminine and nonbinary as one.
    print(
        "Target,"
        "DEMOGRAPHIC,Masculine,Feminine,Nonbinary,Age,Race,"
        "CONCRETE,Action,Body,Hair,Clothing,Color,Meta,Shape,"
        "ABSTRACT,Judgement,Traits,Emotion,Occupation,"
        "INFLAMMATORY,"
        "OTHER,Ambiguous,Inconclusive,Lack,Misc")

    with open(filepath) as fp:
        # The first line of the input is not a record (it is skipped, as before); the default keeps an empty file
        # from raising StopIteration.
        next(fp, None)

        for line in fp:
            record = line.strip()

            # A blank line (e.g. a trailing newline at the end of the file) is not a record; skipping it avoids
            # printing a row with an empty target and all-zero proportions.
            if not record:
                continue

            # Tokenize
            # NOTE(review): labels[0] is the target, yet the whole list is handed to the counters, so the target is
            # looked up like a word and is included in the denominator of every proportion - confirm this is
            # intended.
            labels = record.split(DELIMITER)

            # Demographic words
            (demographic_w, age_words, feminine_words, masculine_words, nonbinary_words,
             race_words) = demographic_words(labels)

            # Concrete words
            (action_words, body_words, hair_words, clothing_words, meta_words, color_words, shape_words,
             concrete_w) = concrete_words(labels)

            # Abstract words
            (judgement_words, traits_words, emotion_words, occupation_words, abstract_w) = abstract_words(labels)

            # Inflammatory words
            inflammatory_w = inflammatory_words(labels)

            # Other words
            (ambiguous_words, lack_of_words, misc_words, inconclusive_words, other_w) = other_words(labels)

            # Column order must match the header printed above.
            row = (
                labels[0],
                demographic_w, masculine_words, feminine_words, nonbinary_words, age_words, race_words,
                concrete_w, action_words, body_words, hair_words, clothing_words, color_words, meta_words,
                shape_words,
                abstract_w, judgement_words, traits_words, emotion_words, occupation_words,
                inflammatory_w,
                other_w, ambiguous_words, inconclusive_words, lack_of_words, misc_words,
            )
            print(DELIMITER.join(str(value) for value in row))
